home *** CD-ROM | disk | FTP | other *** search
- /*
- File: VWavelet_int.c
-
- Contains: Velocity Engine integer implementation Daubechies D4 wavelet code
-
- Copyright: Copyright © 2000 by Apple Computer, Inc., All Rights Reserved.
-
- You may incorporate this Apple sample source code into your program(s) without
- restriction. This Apple sample source code has been provided "AS IS" and the
- responsibility for its operation is yours. You are not permitted to redistribute
- this Apple sample source code as "Apple sample source code" after having made
- changes. If you're going to re-distribute the source, we require that you make
- it clear in the source that the code was descended from Apple sample source
- code, but that you've made changes.
-
-
-
- */
- /*
- Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Computer, Inc.
- ("Apple") in consideration of your agreement to the following terms, and your
- use, installation, modification or redistribution of this Apple software
- constitutes acceptance of these terms. If you do not agree with these terms,
- please do not use, install, modify or redistribute this Apple software.
-
- In consideration of your agreement to abide by the following terms, and subject
- to these terms, Apple grants you a personal, non-exclusive license, under Apple’s
- copyrights in this original Apple software (the "Apple Software"), to use,
- reproduce, modify and redistribute the Apple Software, with or without
- modifications, in source and/or binary forms; provided that if you redistribute
- the Apple Software in its entirety and without modifications, you must retain
- this notice and the following text and disclaimers in all such redistributions of
- the Apple Software. Neither the name, trademarks, service marks or logos of
- Apple Computer, Inc. may be used to endorse or promote products derived from the
- Apple Software without specific prior written permission from Apple. Except as
- expressly stated in this notice, no other rights or licenses, express or implied,
- are granted by Apple herein, including but not limited to any patent rights that
- may be infringed by your derivative works or by other works in which the Apple
- Software may be incorporated.
-
- The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
- WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
- WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN
- COMBINATION WITH YOUR PRODUCTS.
-
- IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
- GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION
- OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT
- (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN
- ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
- #ifdef __MWERKS__
- #include <altivec.h>
- #endif
-
- #include <MacTypes.h>
- #include <MacMemory.h>
-
- #include "vWavelet_int.h"
-
- typedef signed long DATA_TYPE;
-
- #define DO_HORIZ 1
- #define DO_VERT 1
-
- #define INT_SHIFT_AMOUNT 0
-
- #define H0 (11 * (1<< INT_SHIFT_AMOUNT))
- #define H1 (19 * (1<< INT_SHIFT_AMOUNT))
- #define H2 (5 * (1<< INT_SHIFT_AMOUNT))
- #define H3 (-3 * (1<< INT_SHIFT_AMOUNT))
-
- #define INT_WAVELET_SHIFT_AMOUNT 4
- #define INVERSE_WAVELET_SHIFT_AMOUNT 5
-
- #pragma mark F O R W A R D W A V E L E T
-
- ////////////////////////////////////////////////////////////////////////////
- // vFWVT_4_Quad16_Int_Vertical
- //
- // performs a forward vertical wavelet transform
- //
- // pSrc: source pixels (16-bit interleaved RGBA channels)
- // pDest: destination for transform data (16-bit interleaved RGBA channels)
- // numQuads: vertical pixel count
- // skipQuadCount: number of quads between row starts
- // columns: number of columns on which to perform transform
- ////////////////////////////////////////////////////////////////////////////
- void vFWVT_4_Quad16_Int_Vertical(vector signed long *pSrc,
- vector signed long *pDest,
- unsigned long numQuads,
- unsigned long skipQuadCount,
- unsigned long columns)
- {
- long length = (numQuads/2);
- long vectorInIndex;
- long skipVectors = skipQuadCount / 2;
- long columnPairCount = columns / 2;
- long columnPairIndex;
-
- vector signed short *pSrcInput;
- vector signed long *pLoOutput;
- vector signed long *pHiOutput;
-
- // vectors that contain wavelet coefficients
- vector signed short vMUL0 = (vector signed short)(H0, H1, H0, H1,H0, H1,H0, H1);
- vector signed short vMUL1 = (vector signed short)(H2, H3, H2, H3,H2, H3,H2, H3);
- vector signed short vMUL2 = (vector signed short)(H3, -H2,H3, -H2,H3, -H2,H3, -H2);
- vector signed short vMUL3 = (vector signed short)(H1, -H0,H1, -H0,H1, -H0,H1, -H0);
-
- vector signed short vCand1;
- vector signed short vCand2;
- vector signed short vCand3;
- vector signed short vCand4;
-
- vector unsigned char vPermuter1 = (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17);
- vector unsigned char vPermuter2 = (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b, 0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f);
-
- vector unsigned long vecEndShift = (vector unsigned long)(INT_WAVELET_SHIFT_AMOUNT);
- vector signed long vecEndAdd = (vector signed long)(1 << (INT_WAVELET_SHIFT_AMOUNT-1));
-
- vector signed short vInVector1, vInVector2, vInVector3, vInVector4;
- vector signed long vResult1, vResult2, vResult3, vResult4;
- vector signed short vInVectorFirst, vInVectorSecond;
-
-
- for (columnPairIndex = 0; columnPairIndex < columnPairCount; columnPairIndex++) {
-
- pSrcInput = ((vector signed short*)pSrc) + columnPairIndex;
- pLoOutput = (vector signed long*)pDest+columnPairIndex;
- pHiOutput = (vector signed long*)pLoOutput + ((skipQuadCount/2) *length);
-
- // read the first four in, and save 1 and 2 for wraparound at end
- vInVector1 = *pSrcInput;
- pSrcInput += skipVectors;
- vInVectorFirst = vInVector1;
-
- vInVector2 = *pSrcInput;
- pSrcInput += skipVectors;
- vInVectorSecond = vInVector2;
-
- vInVector3 = *pSrcInput;
- pSrcInput += skipVectors;
-
- vInVector4 = *pSrcInput;
- pSrcInput += skipVectors;
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1);
- vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1);
- vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2);
- vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, vResult2);
-
- vResult3 = vec_msum(vMUL0, vCand3, (vector signed long)vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand4, vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand3, (vector signed long)vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand4, vResult4);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits
- vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
- vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
-
- // store results
- *pLoOutput = vResult1;
- pLoOutput += skipVectors;
-
- *pHiOutput = vResult2;
- pHiOutput += skipVectors;
-
- vInVector1 = vInVector3;
- vInVector2 = vInVector4;
-
- vInVector3 = *pSrcInput;
- pSrcInput += skipVectors;
-
- vInVector4 = *pSrcInput;
- pSrcInput += skipVectors;
-
- // loop through remaining pixels
- for (vectorInIndex = 0; vectorInIndex < length-2; vectorInIndex++) {
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1);
- vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1);
- vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2);
- vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, vResult2);
-
- vResult3 = vec_msum(vMUL0, vCand3, (vector signed long)vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand4, vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand3, (vector signed long)vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand4, vResult4);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits
- vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
- vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
-
- // store results
- *pLoOutput = vResult1;
- pLoOutput += skipVectors;
-
- *pHiOutput = vResult2;
- pHiOutput += skipVectors;
-
- vInVector1 = vInVector3;
- vInVector2 = vInVector4;
-
- vInVector3 = *pSrcInput;
- pSrcInput += skipVectors;
-
- vInVector4 = *pSrcInput;
- pSrcInput += skipVectors;
-
- }
-
- // copy initial vectors for wraparound at end
- vInVector3 = vInVectorFirst;
- vInVector4 = vInVectorSecond;
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1);
- vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1);
- vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2);
- vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, vResult2);
-
- vResult3 = vec_msum(vMUL0, vCand3, (vector signed long)vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand4, vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand3, (vector signed long)vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand4, vResult4);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits
- vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
- vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
-
- // store results
- *pLoOutput = vResult1;
- *pHiOutput = vResult2;
- }
- }
-
-
- ////////////////////////////////////////////////////////////////////////////
- // vFWVT_4_Quad16_Int
- //
- // performs a forward horizontal wavelet transform
- //
- // pSrc: source pixels (16-bit interleaved RGBA channels)
- // pDest: destination for transform data (16-bit interleaved RGBA channels)
- // numQuads: horizontal pixel count
- // skipQuads: number of quads between row starts
- // numRows: number of rows on which to perform transform
- ////////////////////////////////////////////////////////////////////////////
- void vFWVT_4_Quad16_Int( vector signed long *pSrc,
- vector signed long *pDest,
- unsigned long numQuads,
- unsigned long skipQuads,
- unsigned long numRows)
- {
- long vectorInIndex;
- long rowIndex;
-
- vector signed short *pSrcInput;
- vector signed long *pLoOutput;
- vector signed long *pHiOutput;
-
- // vectors that contain wavelet coefficients
- vector signed short vMUL0 = (vector signed short)(H0, H1, H0, H1,H0, H1,H0, H1);
- vector signed short vMUL1 = (vector signed short)(H2, H3, H2, H3,H2, H3,H2, H3);
- vector signed short vMUL2 = (vector signed short)(H3, -H2,H3, -H2,H3, -H2,H3, -H2);
- vector signed short vMUL3 = (vector signed short)(H1, -H0,H1, -H0,H1, -H0,H1, -H0);
-
- vector signed short vCand1;
- vector signed short vCand2;
- vector signed short vCand3;
-
- vector unsigned char vPermuter = (vector unsigned char)(0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f);
-
- vector unsigned long vecEndShift = (vector unsigned long)(INT_WAVELET_SHIFT_AMOUNT);
- vector signed long vecEndAdd = (vector signed long)(1 << (INT_WAVELET_SHIFT_AMOUNT-1));
-
- vector signed short vInVector1, vInVector2, vInVector3;
- vector signed long vResult1, vResult2, vResult3, vResult4;
- vector signed short vInVectorFirst;
-
- for (rowIndex = 0; rowIndex < numRows; rowIndex++) {
-
- pSrcInput = (vector signed short*)pSrc+(rowIndex*(skipQuads/2));
- pLoOutput = (vector signed long*)pDest+(rowIndex*(skipQuads/2));
- pHiOutput = pLoOutput + (numQuads/4);
-
- // read the first four in, and save 1 for wraparound at end
- vInVector1 = *pSrcInput;
- pSrcInput += 1;
- vInVectorFirst = vInVector1;
-
- vInVector2 = *pSrcInput;
- pSrcInput += 1;
-
- vInVector3 = *pSrcInput;
- pSrcInput += 1;
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector1, vInVector1, vPermuter);
- vCand2 = vec_perm(vInVector2, vInVector2, vPermuter);
- vCand3 = vec_perm(vInVector3, vInVector3, vPermuter);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, vResult2);
-
- vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand3, vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand3, vResult4);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits
- vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
- vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
-
- // store results
- *pLoOutput++ = vResult1;
- *pHiOutput++ = vResult2;
-
- vInVector1 = vInVector3;
-
- // loop through remaining pixels
- for (vectorInIndex = 0; vectorInIndex < (numQuads-8)/4; vectorInIndex++) {
-
- vInVector2 = *pSrcInput;
- pSrcInput += 1;
-
- vInVector3 = *pSrcInput;
- pSrcInput += 1;
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector1, vInVector1, vPermuter);
- vCand2 = vec_perm(vInVector2, vInVector2, vPermuter);
- vCand3 = vec_perm(vInVector3, vInVector3, vPermuter);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, vResult2);
-
- vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand3, vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand3, vResult4);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits
- vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
- vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
-
- // store results
- *pLoOutput++ = vResult1;
- *pHiOutput++ = vResult2;
-
- vInVector1 = vInVector3;
- }
-
- vInVector2 = *pSrcInput;
- pSrcInput += 1;
-
- vInVector3 = vInVectorFirst;
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector1, vInVector1, vPermuter);
- vCand2 = vec_perm(vInVector2, vInVector2, vPermuter);
- vCand3 = vec_perm(vInVector3, vInVector3, vPermuter);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, vResult2);
-
- vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand3, vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand3, vResult4);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits
- vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
- vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
-
- // store results
- *pLoOutput = vResult1;
- *pHiOutput = vResult2;
-
- }
- }
-
- ////////////////////////////////////////////////////////////////////////////
- // vFWVT_4_Quad16_Int_ExpandFrom8
- //
- // performs a forward horizontal wavelet transform on 8 bit per channel
- // input data (RGBA interleaved), resulting in 16-bit output data.
- //
- // pSrc: source pixels (8-bit interleaved RGBA channels)
- // pDest: destination for transform data (16-bit interleaved RGBA channels)
- // numQuads: horizontal pixel count
- // skipQuads: number of quads between row starts
- // numRows: number of rows on which to perform transform
- ////////////////////////////////////////////////////////////////////////////
- void vFWVT_4_Quad16_Int_ExpandFrom8( vector unsigned char *pSrc,
- vector signed long *pDest,
- unsigned long numQuads,
- unsigned long skipQuads,
- unsigned long numRows)
- {
- long vectorInIndex;
- long rowIndex;
-
- vector unsigned char *pSrcInput;
- vector signed long *pLoOutput;
- vector signed long *pHiOutput;
-
- vector signed short vMUL0 = (vector signed short)(H0, H1, H0, H1,H0, H1,H0, H1);
- vector signed short vMUL1 = (vector signed short)(H2, H3, H2, H3,H2, H3,H2, H3);
- vector signed short vMUL2 = (vector signed short)(H3, -H2,H3, -H2,H3, -H2,H3, -H2);
- vector signed short vMUL3 = (vector signed short)(H1, -H0,H1, -H0,H1, -H0,H1, -H0);
-
- vector signed short vCand1;
- vector signed short vCand2;
- vector signed short vCand3;
-
- vector unsigned char vPermuter = (vector unsigned char)(0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f);
-
- vector unsigned long vecEndShift = (vector unsigned long)(INT_WAVELET_SHIFT_AMOUNT);
- vector signed long vecEndAdd = (vector signed long)(1 << (INT_WAVELET_SHIFT_AMOUNT-1));
-
- vector unsigned char vInVector1, vInVector2;
- vector signed long vResult1, vResult2, vResult3, vResult4;
- vector unsigned char vInVectorFirst;
-
- vector unsigned long vZero = (vector unsigned long)(0);
-
- vector signed short vExpanded1, vExpanded2, vExpanded3;
-
-
- for (rowIndex = 0; rowIndex < numRows; rowIndex++) {
-
-
- pSrcInput = pSrc+(rowIndex*(skipQuads/4));
- pLoOutput = (vector signed long*)pDest+(rowIndex*(skipQuads/2));
- pHiOutput = pLoOutput + (numQuads/4);
-
- // read the first two in, and save 1 for wraparound at end
- vInVector1 = *pSrcInput;
- pSrcInput += 1;
- vInVectorFirst = vInVector1;
-
- vInVector2 = *pSrcInput;
- pSrcInput += 1;
-
- // expand input vectors from 8 to 16 bits
- vExpanded1 = (vector signed short)vec_mergeh((vector unsigned char)vZero, vInVector1);
- vExpanded2 = (vector signed short)vec_mergel((vector unsigned char)vZero, vInVector1);
- vExpanded3 = (vector signed short)vec_mergeh((vector unsigned char)vZero, vInVector2);
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vExpanded1, vExpanded1, vPermuter);
- vCand2 = vec_perm(vExpanded2, vExpanded2, vPermuter);
- vCand3 = vec_perm(vExpanded3, vExpanded3, vPermuter);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, vResult2);
-
- vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand3, vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand3, vResult4);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits
- vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
- vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
-
- // store results
- *pLoOutput++ = vResult1;
- *pHiOutput++ = vResult2;
-
- vCand1 = vCand3;
- vExpanded1 = (vector signed short)vec_mergel((vector unsigned char)vZero, vInVector2);
- vCand2 = vec_perm(vExpanded1, vExpanded1, vPermuter);
-
- // loop through remaining pixels
- for (vectorInIndex = 0; vectorInIndex < (numQuads/4)-2; vectorInIndex++) {
-
- vInVector1 = *pSrcInput++;
-
- vExpanded1 = (vector signed short)vec_mergeh((vector unsigned char)vZero, vInVector1);
- vCand3 = vec_perm(vExpanded1, vExpanded1, vPermuter);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, vResult2);
-
- vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand3, vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand3, vResult4);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits
- vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
- vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
-
- // store results
- *pLoOutput++ = vResult1;
- *pHiOutput++ = vResult2;
-
- vCand1 = vCand3;
- vExpanded1 = (vector signed short)vec_mergel((vector unsigned char)vZero, vInVector1);
- vCand2 = vec_perm(vExpanded1, vExpanded1, vPermuter);
-
- }
-
- vInVector1 = vInVectorFirst;
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vExpanded1 = (vector signed short)vec_mergeh((vector unsigned char)vZero, vInVector1);
- vCand3 = vec_perm(vExpanded1, vExpanded1, vPermuter);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, vResult2);
-
- vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand3, vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand3, vResult4);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits
- vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
- vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
-
- // store results
- *pLoOutput = vResult1;
- *pHiOutput = vResult2;
-
- }
- }
-
- ////////////////////////////////////////////////////////////////////////////
- // vFWVT_4_Quad16_2DInt
- //
- // performs a two-dimensional forward wavelet on an image. Performs the
- // number of transforms specified by depth.
- //
- // pSrc: source pixels (8-bit interleaved RGBA channels)
- // pDst: destination for transform data (16-bit interleaved RGBA channels)
- // pTemp: temporary image store (16-bit interleaved RGBA channels, full image size)
- // x, y: dimensions of image
- // rowQuads: number of RGBA quads between row starts
- // depth: depth of wavelet transforms to perform
- //
- // *NOTE* there is a limit of 3 for the wavelet depth
- // due to implementation limitations of the current
- // vector wavelet. If a depth > 3 is used, then the
- // intermediate calculations will overflow the 16-bit
- // vector elements.
- //
- ////////////////////////////////////////////////////////////////////////////
-
- void vFWVT_4_Quad16_2DInt(long *pSrc,
- long *pDst,
- long *pTemp,
- unsigned long x,
- unsigned long y,
- unsigned long rowQuads,
- unsigned long depth)
- {
- unsigned long depthIndex;
- unsigned long currentDepthHeight;
- unsigned long currentDepthWidth;
-
-
- currentDepthHeight = y;
- currentDepthWidth = x;
-
- // perform first forward horizontal transform, and
- // simultaneously expand data from 8 to 16 bits.
- vFWVT_4_Quad16_Int_ExpandFrom8( (vector unsigned char*)pSrc,
- (vector signed long*)pTemp,
- currentDepthWidth,
- rowQuads,
- currentDepthHeight
- );
-
- // perform first forward vertical transform
- vFWVT_4_Quad16_Int_Vertical((vector signed long*)pTemp,
- (vector signed long*)pDst,
- currentDepthHeight,
- rowQuads,
- currentDepthWidth);
-
-
- // perform remaining depth transforms
- for (depthIndex = 1; depthIndex < depth; depthIndex++) {
- currentDepthHeight >>= 1;
- currentDepthWidth >>= 1;
-
- // do horizontal transform
- vFWVT_4_Quad16_Int( (vector signed long*)pDst,
- (vector signed long*)pTemp,
- currentDepthWidth,
- rowQuads,
- currentDepthHeight
- );
-
- // do vertical transform
- vFWVT_4_Quad16_Int_Vertical((vector signed long*)pTemp,
- (vector signed long*)pDst,
- currentDepthHeight,
- rowQuads,
- currentDepthWidth);
-
-
-
- }
- }
-
- #pragma mark -
-
- #pragma mark I N V E R S E W A V E L E T
-
- ////////////////////////////////////////////////////////////////////////////
- // vIFWVT_4_Quad16_Int_Vertical
- //
- // performs an inverse vertical wavelet transform
- //
- // pSrc: source pixels (16-bit interleaved RGBA channels)
- // pDest: destination for transform data (16-bit interleaved RGBA channels)
- // numQuads: vertical pixel count
- // skipQuadCount: number of quads between row starts
- // columns: number of columns on which to perform transform
- ////////////////////////////////////////////////////////////////////////////
-
- void vIFWVT_4_Quad16_Int_Vertical( vector signed long *pSrc,
- vector signed long *pDest,
- unsigned long numQuads,
- unsigned long skipQuadCount,
- unsigned long columns)
- {
- long length = (numQuads/2);
- long vectorIndex;
- long skipVectors = skipQuadCount / 2;
- long columnIndex;
- long columnCount = columns / 2;
-
- vector signed long *pColumnOut;
- vector signed short *pSrcHi, *pSrcLo;
- vector signed short *pColStart;
-
- vector signed short vMUL0 = (vector signed short)(H1, H3, H1, H3, H1, H3, H1, H3);
- vector signed short vMUL1 = (vector signed short)(-H2, -H0, -H2, -H0, -H2, -H0, -H2, -H0);
- vector signed short vMUL2 = (vector signed short)(H0, H2, H0, H2, H0, H2, H0, H2);
- vector signed short vMUL3 = (vector signed short)(H3, H1, H3, H1, H3, H1, H3, H1);
-
- vector signed short vCand1;
- vector signed short vCand2;
- vector signed short vCand3;
- vector signed short vCand4;
-
- vector unsigned char vPermuter1 = (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17);
- vector unsigned char vPermuter2 = (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b, 0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f);
-
- vector unsigned long vecEndShift = (vector unsigned long)(INVERSE_WAVELET_SHIFT_AMOUNT);
- vector signed long vecEndAdd = (vector signed long)(1 << (INVERSE_WAVELET_SHIFT_AMOUNT-1));
-
- vector signed short vInVector1, vInVector2, vInVector3, vInVector4;
- vector signed long vResult1, vResult2, vResult3, vResult4;
-
- vector unsigned long vecOnes = (vector unsigned long)(3);
-
- for (columnIndex = 0; columnIndex < columnCount; columnIndex++) {
-
- pColumnOut = pDest + columnIndex;
- pColStart = (vector signed short*)pSrc+columnIndex;
- pSrcLo = pColStart;
- pSrcHi = pSrcLo + (skipVectors*numQuads)/2;
-
- // read in input vectors
- vInVector1 = *pSrcLo;
- vInVector2 = *(pSrcHi-skipVectors);
- vInVector3 = *pSrcHi;
- vInVector4 = *(pSrcLo+((numQuads-1)*(skipVectors)));
-
- pSrcHi+=skipVectors;
- pSrcLo+=skipVectors;
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1);
- vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, (vector signed long)vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, (vector signed long)vResult2);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2);
- vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult3 = vec_msum(vMUL0, vCand3, vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand4, (vector signed long)vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand3, vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand4, (vector signed long)vResult4);
-
- // do end shift to perform divide
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits and store results
- *pColumnOut = (vector signed long)vec_pack(vResult2, vResult4);
- pColumnOut += skipVectors;
-
- *pColumnOut = (vector signed long)vec_pack(vResult1, vResult3);
- pColumnOut += skipVectors;
-
- for (vectorIndex = 0; vectorIndex < length-1; vectorIndex++) {
-
- // read in input vectors
- vInVector2 = *pSrcLo;
- vInVector4 = *pSrcHi;
-
- pSrcHi+=skipVectors;
- pSrcLo+=skipVectors;
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector2, vInVector1, vPermuter1);
- vCand2 = vec_perm(vInVector4, vInVector3, vPermuter1);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL2, vCand1, vecEndAdd);
- vResult1 = vec_msum(vMUL3, vCand2, (vector signed long)vResult1);
-
- vResult2 = vec_msum(vMUL0, vCand1, vecEndAdd);
- vResult2 = vec_msum(vMUL1, vCand2, (vector signed long)vResult2);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand3 = vec_perm(vInVector2, vInVector1, vPermuter2);
- vCand4 = vec_perm(vInVector4, vInVector3, vPermuter2);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult3 = vec_msum(vMUL2, vCand3, vecEndAdd);
- vResult3 = vec_msum(vMUL3, vCand4, (vector signed long)vResult3);
-
- vResult4 = vec_msum(vMUL0, vCand3, vecEndAdd);
- vResult4 = vec_msum(vMUL1, vCand4, (vector signed long)vResult4);
-
- // do end shift to perform divide
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack results to 16 bits and store results
- *pColumnOut = (vector signed long)vec_pack(vResult1, vResult3);
- pColumnOut += skipVectors;
-
- *pColumnOut = (vector signed long)vec_pack(vResult2, vResult4);
- pColumnOut += skipVectors;
-
- vInVector1 = vInVector2;
- vInVector3 = vInVector4;
- }
-
- }
- }
-
-
- ////////////////////////////////////////////////////////////////////////////
- // vIFWVT_4_Quad16_Int_PackTo8
- //
- // performs an inverse horizontal wavelet transform, and packs final
- // data from 16 bits to 8 bits per channel.
- //
- // pSrc: source pixels (16-bit interleaved RGBA channels)
- // pDest: destination for transform data (8-bit interleaved RGBA channels)
- // numQuads: horizontal pixel count
- // skipQuads: number of quads between row starts
- // numRows: number of rows on which to perform transform
- ////////////////////////////////////////////////////////////////////////////
-
- static void vIFWVT_4_Quad16_Int_PackTo8( vector signed long *pSrc,
- vector signed long *pDest,
- unsigned long numQuads,
- unsigned long skipQuads,
- unsigned long numRows)
- {
- long length = (numQuads/2);
- long vectorIndex;
- long rowIndex;
-
- vector signed long *pRowDest;
- vector signed short *pSrcHi, *pSrcLo;
- vector signed short *pRowStart;
-
-
- vector signed short vMUL0 = (vector signed short)(H3, H1, H3, H1, H3, H1, H3, H1);
- vector signed short vMUL1 = (vector signed short)(-H0, -H2, -H0, -H2, -H0, -H2, -H0, -H2);
- vector signed short vMUL2 = (vector signed short)(H2, H0, H2, H0, H2, H0, H2, H0);
- vector signed short vMUL3 = (vector signed short)(H1, H3, H1, H3, H1, H3, H1, H3);
-
- vector signed short vCand1;
- vector signed short vCand2;
- vector signed short vCand3;
- vector signed short vCand4;
-
- vector unsigned char vPermuter1_first = (vector unsigned char)(0x08, 0x09, 0x10, 0x11, 0x0a, 0x0b, 0x12, 0x13, 0x0c, 0x0d, 0x14, 0x15, 0x0e, 0x0f, 0x16, 0x17);
- vector unsigned char vPermuter2_first = (vector unsigned char)(0x10, 0x11, 0x18, 0x19, 0x12, 0x13, 0x1a, 0x1b, 0x14, 0x15, 0x1c, 0x1d, 0x16, 0x17, 0x1e, 0x1f);
-
- vector unsigned long vecEndShift = (vector unsigned long)(INVERSE_WAVELET_SHIFT_AMOUNT);
- vector signed long vecEndAdd = (vector signed long)(1 << (INVERSE_WAVELET_SHIFT_AMOUNT-1));
-
- vector signed short vInVector1, vInVector2, vInVector3, vInVector4;
- vector signed long vResult1, vResult2, vResult3, vResult4;
-
- vector unsigned long vZero = (vector unsigned long)(0);
-
- vector unsigned long vecOnes = (vector unsigned long)(3);
-
- vector signed short prepackResultHi, prepackResultLo;
-
- vector signed short vMax = (vector signed short)(0xff);
-
-
- for (rowIndex = 0; rowIndex < numRows; rowIndex++) {
-
- pRowDest = (vector signed long*)pDest + (rowIndex*(skipQuads/4));
- pRowStart = (vector signed short*)pSrc + (rowIndex*(skipQuads/2));
- pSrcLo = pRowStart;
- pSrcHi = pSrcLo + (numQuads/4);
-
- // read in input vectors
- vInVector1 = *pSrcLo;
- vInVector2 = *(pSrcHi-1);
- vInVector3 = *pSrcHi;
- vInVector4 = *(pSrcLo+((numQuads-2)/2));
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector2, vInVector1, vPermuter1_first);
- vCand2 = vec_perm(vInVector4, vInVector3, vPermuter1_first);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, (vector signed long)vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, (vector signed long)vResult2);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
-
- // pack to 16, and ensure max/min of 0 and 255 before pack to 8 bits
- prepackResultHi = (vector signed short)vec_pack(vResult2, vResult1);
-
- prepackResultHi = vec_max((vector signed short)vZero, prepackResultHi);
- prepackResultHi = vec_min(vMax, prepackResultHi);
-
- //////////////////////////////////
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand3 = vec_perm(vInVector2, vInVector1, vPermuter2_first);
- vCand4 = vec_perm(vInVector4, vInVector3, vPermuter2_first);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult3 = vec_msum(vMUL0, vCand3, vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand4, (vector signed long)vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand3, vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand4, (vector signed long)vResult4);
-
- // do end shift to perform divide
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack to 16, and ensure max/min of 0 and 255 before pack to 8 bits
- prepackResultLo = (vector signed short)vec_pack(vResult4, vResult3);
-
- prepackResultLo = vec_max((vector signed short)vZero, prepackResultLo);
- prepackResultLo = vec_min(vMax, prepackResultLo);
-
- // pack to 8 bits and store result
- *pRowDest++ = (vector signed long)vec_pack(prepackResultHi, prepackResultLo);
-
- ++pSrcLo;
- ++pSrcHi;
-
- for (vectorIndex = 0; vectorIndex < (length/2)-1; vectorIndex++) {
- // read in input vectors
- vInVector2 = *pSrcLo++;
- vInVector4 = *pSrcHi++;
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1_first);
- vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1_first);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult1 = vec_msum(vMUL0, vCand1, vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, (vector signed long)vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, (vector signed long)vResult2);
-
- // do end shift to perform divide
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
-
- // pack to 16, and ensure max/min of 0 and 255 before pack to 8 bits
- prepackResultHi = (vector signed short)vec_pack(vResult2, vResult1);
-
- prepackResultHi = vec_max((vector signed short)vZero, prepackResultHi);
- prepackResultHi = vec_min(vMax, prepackResultHi);
-
- //////////////////////////////////
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2_first);
- vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2_first);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult3 = vec_msum(vMUL0, vCand3, vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand4, (vector signed long)vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand3, vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand4, (vector signed long)vResult4);
-
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack to 16, and ensure max/min of 0 and 255 before pack to 8 bits
- prepackResultLo = (vector signed short)vec_pack(vResult4, vResult3);
-
- prepackResultLo = vec_max((vector signed short)vZero, prepackResultLo);
- prepackResultLo = vec_min(vMax, prepackResultLo);
-
- // pack to 8 bits and store result
-
- *pRowDest++ = (vector signed long)vec_pack(prepackResultHi, prepackResultLo);
-
- vInVector1 = vInVector2;
- vInVector3 = vInVector4;
-
- }
-
- }
- }
-
-
-
-
- ////////////////////////////////////////////////////////////////////////////
- // vIFWVT_4_Quad16_Int
- //
- // performs an inverse horizontal wavelet transform
- //
- // pSrc: source pixels (16-bit interleaved RGBA channels)
- // pDest: destination for transform data (16-bit interleaved RGBA channels)
- // numQuads: horizontal pixel count
- // skipQuads: number of quads between row starts
- // numRows: number of rows on which to perform transform
- ////////////////////////////////////////////////////////////////////////////
-
- void vIFWVT_4_Quad16_Int( vector signed long *pSrc,
- vector signed long *pDest,
- unsigned long numQuads,
- unsigned long skipQuads,
- unsigned long numRows)
- {
- long length = (numQuads/2);
- long vectorIndex;
- long rowIndex;
-
- vector signed long *pRowDest;
- vector signed short *pSrcHi, *pSrcLo;
- vector signed short *pRowStart;
-
-
- vector signed short vMUL0 = (vector signed short)(H3, H1, H3, H1, H3, H1, H3, H1);
- vector signed short vMUL1 = (vector signed short)(-H0, -H2, -H0, -H2, -H0, -H2, -H0, -H2);
- vector signed short vMUL2 = (vector signed short)(H2, H0, H2, H0, H2, H0, H2, H0);
- vector signed short vMUL3 = (vector signed short)(H1, H3, H1, H3, H1, H3, H1, H3);
-
- vector signed short vCand1;
- vector signed short vCand2;
- vector signed short vCand3;
- vector signed short vCand4;
-
- vector unsigned char vPermuter1_first = (vector unsigned char)(0x08, 0x09, 0x10, 0x11, 0x0a, 0x0b, 0x12, 0x13, 0x0c, 0x0d, 0x14, 0x15, 0x0e, 0x0f, 0x16, 0x17);
- vector unsigned char vPermuter2_first = (vector unsigned char)(0x10, 0x11, 0x18, 0x19, 0x12, 0x13, 0x1a, 0x1b, 0x14, 0x15, 0x1c, 0x1d, 0x16, 0x17, 0x1e, 0x1f);
-
- vector unsigned long vecEndShift = (vector unsigned long)(INVERSE_WAVELET_SHIFT_AMOUNT);
- vector signed long vecEndAdd = (vector signed long)(1 << (INVERSE_WAVELET_SHIFT_AMOUNT-1));
-
- vector signed short vInVector1, vInVector2, vInVector3, vInVector4;
- vector signed long vResult1, vResult2, vResult3, vResult4;
-
- vector unsigned long vecOnes = (vector unsigned long)(3);
-
- for (rowIndex = 0; rowIndex < numRows; rowIndex++) {
-
- pRowDest = (vector signed long*)pDest + (rowIndex*(skipQuads/2));;
- pRowStart = (vector signed short*)pSrc + (rowIndex*(skipQuads/2));
- pSrcLo = pRowStart;
- pSrcHi = pSrcLo + (numQuads/4);
-
- // read in input vectors
- vInVector1 = *pSrcLo;
- vInVector2 = *(pSrcHi-1);
- vInVector3 = *pSrcHi;
- vInVector4 = *(pSrcLo+((numQuads-2)/2));
-
- vCand1 = vec_perm(vInVector2, vInVector1, vPermuter1_first);
- vCand2 = vec_perm(vInVector4, vInVector3, vPermuter1_first);
-
- vResult1 = vec_msum(vMUL0, vCand1, vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, (vector signed long)vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, (vector signed long)vResult2);
-
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
-
- *pRowDest++ = (vector signed long)vec_pack(vResult2, vResult1);
-
- vCand3 = vec_perm(vInVector2, vInVector1, vPermuter2_first);
- vCand4 = vec_perm(vInVector4, vInVector3, vPermuter2_first);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult3 = vec_msum(vMUL0, vCand3, vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand4, (vector signed long)vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand3, vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand4, (vector signed long)vResult4);
-
- // do end shift to perform divide
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- *pRowDest++ = (vector signed long)vec_pack(vResult4, vResult3);
-
- ++pSrcLo;
- ++pSrcHi;
-
- for (vectorIndex = 0; vectorIndex < (length/2)-1; vectorIndex++) {
- // read in input vectors
- vInVector2 = *pSrcLo++;
- vInVector4 = *pSrcHi++;
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1_first);
- vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1_first);
-
- vResult1 = vec_msum(vMUL0, vCand1, vecEndAdd);
- vResult1 = vec_msum(vMUL1, vCand2, (vector signed long)vResult1);
-
- vResult2 = vec_msum(vMUL2, vCand1, vecEndAdd);
- vResult2 = vec_msum(vMUL3, vCand2, (vector signed long)vResult2);
-
- vResult1 = vec_sra(vResult1,vecEndShift);
- vResult2 = vec_sra(vResult2,vecEndShift);
-
- // pack result to 16 bits and store
- *pRowDest++ = (vector signed long)vec_pack(vResult2, vResult1);
-
-
- // set up multiplicand vectors using raw input vectors and permute instruction
- vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2_first);
- vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2_first);
-
- // calculate pixel results by multiplying elements by the wavelet coefficients,
- // with additional add for appropriate rounding in divide
- vResult3 = vec_msum(vMUL0, vCand3, vecEndAdd);
- vResult3 = vec_msum(vMUL1, vCand4, (vector signed long)vResult3);
-
- vResult4 = vec_msum(vMUL2, vCand3, vecEndAdd);
- vResult4 = vec_msum(vMUL3, vCand4, (vector signed long)vResult4);
-
- // do end shift to perform divide
- vResult3 = vec_sra(vResult3,vecEndShift);
- vResult4 = vec_sra(vResult4,vecEndShift);
-
- // pack result to 16 bits and store
- *pRowDest++ = (vector signed long)vec_pack(vResult4, vResult3);
-
- ///////////////////////////////////
-
- vInVector1 = vInVector2;
- vInVector3 = vInVector4;
-
- }
-
- }
- }
-
-
- ////////////////////////////////////////////////////////////////////////////
- // vIFWVT_4_Quad16_2DInt
- //
- // performs a two-dimensional inverse wavelet on an image. Performs the
- // number of transforms specified by depth.
- //
- // pSrc: source pixels (8-bit interleaved RGBA channels)
- // pDst: destination for transform data (16-bit interleaved RGBA channels)
- // pTemp: temporary image store (16-bit interleaved RGBA channels, full image size)
- // x, y: dimensions of image
- // rowQuads: number of RGBA quads between row starts
- // depth: depth of wavelet transforms to perform
- //
- // *NOTE* there is a limit of 3 for the wavelet depth
- // due to implementation limitations of the current
- // vector wavelet. If a depth > 3 is used, then the
- // intermediate calculations will overflow the 16-bit
- // vector elements.
- //
- // *NOTE* transform may destroy source data, as it
- // uses it as temp storage for an intermediate transform
- ////////////////////////////////////////////////////////////////////////////
-
-
- void vIFWVT_4_Quad16_2DInt(long *pSrc,
- long *pDst,
- long *pTemp,
- unsigned long x,
- unsigned long y,
- unsigned long rowQuads,
- unsigned long depth)
- {
- unsigned long depthIndex;
- unsigned long currentDepthHeight;
- unsigned long currentDepthWidth;
-
-
- currentDepthHeight = y >> (depth);
- currentDepthWidth = x >> (depth);
-
- // if depth is 1, perform transform directly to dest
- if (depth == 1) {
- currentDepthHeight <<= 1;
- currentDepthWidth <<= 1;
-
- // to vertical from source to temp
- vIFWVT_4_Quad16_Int_Vertical((vector signed long*)pSrc,
- (vector signed long*)pTemp,
- currentDepthHeight,
- rowQuads,
- currentDepthWidth);
-
- // to horizontal from temp to dest, pack to 8 bits
- vIFWVT_4_Quad16_Int_PackTo8((vector signed long*)pTemp,
- (vector signed long*)pDst,
- currentDepthWidth,
- rowQuads,
- currentDepthHeight
- );
-
-
-
- } else if (depth) {
-
- // loop through all but last depth
- for (depthIndex = 0; depthIndex < depth-1; depthIndex++) {
- currentDepthHeight <<= 1;
- currentDepthWidth <<= 1;
-
- // do vertical transform from source to temp
- vIFWVT_4_Quad16_Int_Vertical((vector signed long*)pSrc,
- (vector signed long*)pTemp,
- currentDepthHeight,
- rowQuads,
- currentDepthWidth);
-
-
- // do horizontal transform from temp to src
- vIFWVT_4_Quad16_Int((vector signed long*)pTemp,
- (vector signed long*)pSrc,
- currentDepthWidth,
- rowQuads,
- currentDepthHeight
- );
-
-
-
- }
-
- currentDepthHeight <<= 1;
- currentDepthWidth <<= 1;
-
- // to vertical from source to temp
- vIFWVT_4_Quad16_Int_Vertical((vector signed long*)pSrc,
- (vector signed long*)pTemp,
- currentDepthHeight,
- rowQuads,
- currentDepthWidth);
-
-
- // to horizontal from temp to dest, pack to 8 bits
- vIFWVT_4_Quad16_Int_PackTo8((vector signed long*)pTemp,
- (vector signed long*)pDst,
- currentDepthWidth,
- rowQuads,
- currentDepthHeight
- );
-
-
- }
-
- }
-
-